********************************************************************************
* DESCRIPTION: Function to find strongly or weakly connected set of employers.
*
* INPUT:       - Global macros (see below), and 3 positional arguments naming variables in the loaded dataset, in this order: date variable, employer ID variable, person (worker) ID variable.
*
* OUTPUT:      - Data in memory containing IDs of employers in connected set (and category indicator, if requested by setting ${connect_by_categ} == 1).
*
********************************************************************************
********************************************************************************
* MAIN CODE
********************************************************************************
*** check global macros and arguments
* make sure the following globals are defined (i.e., non-empty); abort with an assertion error otherwise:
foreach g in ///
	connect_by_categ /// 0 = create connected set for population; 1 = create connected set separately by category
	connect_strong /// 0 = create weakly connected set; 1 = create strongly connected set
	ext /// file name extension (numeric date and time)
	TEMP_DIR /// directory for temporary files exchanged with MATLAB
	MATLABPATH /// MATLAB executable called via shell
	DO_DIR /// directory containing VALUE_1_CONNECTED.m
	{
	assert "${`g'}" != ""
}

* when building the connected set separately by category, the category variable must be
* named in ${categ_var} and must exist in the loaded dataset (it is used in keep/use/drop below)
if ${connect_by_categ} {
	assert "${categ_var}" != ""
	confirm variable ${categ_var}
}

* make sure all 3 positional arguments (date, employer ID, worker ID) were passed
* and that each one names a variable in the loaded dataset, so that the renames below cannot fail obscurely
forvalues a = 1/3 {
	assert "``a''" != ""
	confirm variable ``a''
}


*** standardize variable names and stash the data
* map the three positional arguments onto the fixed variable names used in the rest of this file
rename `1' date
rename `2' id_employer
rename `3' id_worker

* span of the date variable = number of lags considered later on
* Note: assumes no gaps in years
summarize date, meanonly
local n_lags = `=r(max)' - `=r(min)'

* move the key variables to the front (plus the category variable when splitting by category)
if ${connect_by_categ} {
	order id_employer id_worker date ${categ_var}
}
else {
	order id_employer id_worker date
}
compress

* a copy on disk is written only when the data are processed separately by category
if ${connect_by_categ} {
	save "${TEMP_DIR}/temp_fun_connected_${ext}.dta", replace
}


*** find (weakly or strongly) connected set in MATLAB
* one pass over all data when ${connect_by_categ} == 0; two passes (category 1, then category 2) when it is 1
forvalues grp = 0/${connect_by_categ} {
	if ${connect_by_categ} {
		if `grp' == 0 {
			// first pass: category 1 is taken from the data still in memory
			display "--> CATEGORY = 1 (male/nohsch)"
			keep if ${categ_var} == 1
		}
		else if `grp' == 1 {
			// second pass: category 2 is reloaded from the temporary copy, which is then deleted
			display "--> CATEGORY = 2 (female/hsch)"
			use if ${categ_var} == 2 using "${TEMP_DIR}/temp_fun_connected_${ext}.dta", clear
			erase "${TEMP_DIR}/temp_fun_connected_${ext}.dta"
		}
		drop ${categ_var}
	}

	display "* set panel"
	xtset id_worker date

	display "* export list of current and previous employer IDs"
	if ${connect_strong} {
		// strongly connected set: use only E-to-E transitions (employer in the immediately preceding period)
		generate double lag_id_employer = L.id_employer
	}
	else {
		// weakly connected set: use E-to-E and E-to-U-to-E transitions,
		// i.e., take the nearest earlier period (up to `n_lags' back) with a non-missing employer
		generate double lag_id_employer = .
		forvalues lag = 1/`n_lags' {
			replace lag_id_employer = L`lag'.id_employer if lag_id_employer == .
		}
	}

	// from here on only the links matter, so date and id_worker are dropped
	keep id_employer lag_id_employer
	// require both ends of the link to be non-missing (drops workers only ever seen at one employer
	// for the weakly connected set, or without any E-to-E transition for the strongly connected set)
	keep if id_employer != . & lag_id_employer != .
	// drop stayers: self-links are irrelevant for connectedness
	keep if id_employer != lag_id_employer
	// one observation per distinct link is sufficient
	bysort id_employer lag_id_employer: keep if _n == 1
	format id_employer lag_id_employer %14.0f
	compress
	export delimited id_employer lag_id_employer using "${TEMP_DIR}/connected_input_${ext}.csv", novarnames nolabel datafmt delimiter(tab) replace
	clear

	display "* delete earlier output so as to not cause confusion"
	capture erase "${TEMP_DIR}/connected_output_${ext}.txt"

	display "* call MATLAB via shell"
	// one-row dataset holding the parameters passed to MATLAB through a tab-delimited file
	set obs 1
	generate byte connect_strong = ${connect_strong}
	format connect_strong %1.0f
	generate double ext = ${ext}
	format ext %12.0f

	// wait (checking once per minute) for as long as a parameters file is already present
	// NOTE(review): presumably the MATLAB script removes this file once it has read it -- confirm
	capture confirm file "${TEMP_DIR}/parameters_connected.csv"
	if _rc == 0 {
		display as error "USER WARNING: Parameters file (${TEMP_DIR}/parameters_connected.csv) already exists -- entering sleep loop."
	}
	local params_pending = 1
	while `params_pending' {
		capture confirm file "${TEMP_DIR}/parameters_connected.csv"
		local params_pending = (_rc == 0)
		if `params_pending' {
			sleep 60000
		}
	}
	compress
	export delimited connect_strong ext using "${TEMP_DIR}/parameters_connected.csv", novarnames nolabel datafmt delimiter(tab) replace
	clear
	shell "${MATLABPATH}" -nojvm -r "run ${DO_DIR}/VALUE_1_CONNECTED.m"

	// block until the MATLAB output file shows up, checking every 10 seconds
	capture confirm file "${TEMP_DIR}/connected_output_${ext}.txt"
	while _rc != 0 {
		sleep 10000
		capture confirm file "${TEMP_DIR}/connected_output_${ext}.txt"
	}

	display "* read MATLAB output"
	import delimited using "${TEMP_DIR}/connected_output_${ext}.txt", varnames(1) asdouble clear
	compress
	label variable id_employer "Employer ID (deidentified)"

	display "* delete old data files"
	erase "${TEMP_DIR}/connected_input_${ext}.csv"
	erase "${TEMP_DIR}/connected_output_${ext}.txt"

	if ${connect_by_categ} {
		if `grp' == 0 {
			// park the category-1 result on disk until category 2 has been processed
			save "${TEMP_DIR}/temp_fun_connected_g1_${ext}.dta", replace
		}
		else if `grp' == 1 {
			// tag the category-2 result, stack the parked category-1 result underneath, and tag it as well
			generate byte ${categ_var} = 2
			label variable ${categ_var} "Category"
			append using "${TEMP_DIR}/temp_fun_connected_g1_${ext}.dta"
			erase "${TEMP_DIR}/temp_fun_connected_g1_${ext}.dta"
			replace ${categ_var} = 1 if ${categ_var} == .
			label define gen_l 1 "Male/NoHSch" 2 "Female/HSch", replace
			label values ${categ_var} gen_l
		}
	}
}

* restore the caller's original employer ID variable name (argument 2); the date and worker ID
* variables were dropped inside the loop above, so there is nothing else to rename back
rename id_employer `2'




********************************************************************************
* RETURNING TO MAIN CODE
********************************************************************************
// Note: at this point, memory contains the employer ID variable (`2') and, if $connect_by_categ == 1, the category variable
